# Read in the data and rename the columns to align with the cloud-data
# dictionary. The three MISR image files share the same 11-column layout,
# so the loading/renaming logic lives in one helper instead of being
# repeated verbatim per image.
read_image <- function(path, name) {
  img <- as.data.frame(read.table(path))
  img$picture <- rep(name, nrow(img))
  colnames(img)[1:11] <- c("y_coordinate", "x_coordinate", "label", "NDAI",
                           "SD", "CORR", "DF_angle", "CF_angle", "BF_angle",
                           "AF_angle", "AN_angle")
  img
}

image1 <- read_image("image_data/image1.txt", "image1")
image2 <- read_image("image_data/image2.txt", "image2")
image3 <- read_image("image_data/image3.txt", "image3")

Problem 1b Summarize Data

# Pool the three labelled images into one data frame for joint summaries.
total_df <- rbind(image1, image2, image3)

# Spatial maps of the expert labels: one point per pixel, coloured by label.
ggplot(image1) +
  geom_point(alpha = 0.5,
             aes(x = x_coordinate, y = y_coordinate, color = factor(label))) +
  labs(x = " x coordinate", y = " y coordinate",
       title = "Coordinate Map image1 with label ") +
  theme_bw()

ggplot(image2) +
  geom_point(alpha = 0.5,
             aes(x = x_coordinate, y = y_coordinate, color = factor(label))) +
  labs(x = " x coordinate", y = " y coordinate",
       title = "Coordinate Map image2 with label ") +
  theme_bw()

ggplot(image3) +
  geom_point(alpha = 0.5,
             aes(x = x_coordinate, y = y_coordinate, color = factor(label))) +
  labs(x = " x coordinate", y = " y coordinate",
       title = "Coordinate Map image3 with label ") +
  theme_bw()

# Check if there's any missing data
sum(is.na(total_df))
## [1] 0
# There is no missing value: every cell of the pooled data is observed.
summary(total_df)
##   y_coordinate    x_coordinate       label              NDAI        
##  Min.   :  2.0   Min.   : 65.0   Min.   :-1.0000   Min.   :-1.8420  
##  1st Qu.: 98.0   1st Qu.:143.0   1st Qu.:-1.0000   1st Qu.:-0.4286  
##  Median :193.0   Median :218.0   Median : 0.0000   Median : 1.3476  
##  Mean   :193.1   Mean   :218.1   Mean   :-0.1334   Mean   : 1.0847  
##  3rd Qu.:289.0   3rd Qu.:294.0   3rd Qu.: 0.0000   3rd Qu.: 2.3142  
##  Max.   :383.0   Max.   :369.0   Max.   : 1.0000   Max.   : 4.5639  
##        SD                CORR            DF_angle         CF_angle     
##  Min.   :  0.1987   Min.   :-0.3872   Min.   : 45.28   Min.   : 31.19  
##  1st Qu.:  1.6376   1st Qu.: 0.1253   1st Qu.:244.56   1st Qu.:219.27  
##  Median :  4.3095   Median : 0.1603   Median :281.91   Median :259.31  
##  Mean   :  8.0633   Mean   : 0.1860   Mean   :271.36   Mean   :246.37  
##  3rd Qu.: 10.2264   3rd Qu.: 0.2231   3rd Qu.:300.39   3rd Qu.:279.59  
##  Max.   :117.5810   Max.   : 0.8144   Max.   :410.53   Max.   :360.68  
##     BF_angle         AF_angle         AN_angle        picture         
##  Min.   : 24.49   Min.   : 21.07   Min.   : 20.57   Length:345556     
##  1st Qu.:200.79   1st Qu.:185.16   1st Qu.:174.88   Class :character  
##  Median :236.17   Median :211.54   Median :197.58   Mode  :character  
##  Mean   :224.20   Mean   :201.71   Mean   :188.29                     
##  3rd Qu.:258.62   3rd Qu.:235.15   3rd Qu.:216.80                     
##  Max.   :335.08   Max.   :318.70   Max.   :306.93
# Calculate the proportion of each label within every image.
# summarise() peels off the innermost (label) grouping, so the result is
# still grouped by picture and the sum(n) in mutate() runs per picture,
# making freq a within-image proportion.
percentage_label<- total_df %>%
  group_by(picture, label) %>%
  summarise (n = n()) %>%
  mutate(freq = n / sum(n))
percentage_label
## # A tibble: 9 x 4
## # Groups:   picture [3]
##   picture label     n  freq
##   <chr>   <dbl> <int> <dbl>
## 1 image1     -1 50446 0.438
## 2 image1      0 44312 0.385
## 3 image1      1 20471 0.178
## 4 image2     -1 42882 0.373
## 5 image2      0 32962 0.286
## 6 image2      1 39266 0.341
## 7 image3     -1 33752 0.293
## 8 image3      0 60221 0.523
## 9 image3      1 21244 0.184
# One proportion bar chart per image. The plotting code was triplicated;
# a small helper builds each panel. Only the first panel carries the
# y-axis label so the three panels read as a single figure.
prop_bar <- function(img, y_label) {
  ggplot(data = percentage_label[percentage_label$picture == img, ],
         aes(x = label, y = freq)) +
    geom_bar(stat = "identity", color = "black", fill = "lightskyblue3") +
    ylab(y_label) +
    ylim(0, 0.6)
}
g1 <- prop_bar("image1", "Proportion")
g2 <- prop_bar("image2", NULL)
g3 <- prop_bar("image3", NULL)
grid.arrange(g1, g2, g3, ncol = 3,
             top = textGrob("Proportion of label in each image",
                            gp = gpar(fontsize = 16)))

Problem 1c EDA

# Label as a factor; reused by the density plots below for colour/fill.
labels <- factor(total_df$label)
# Correlation matrix of every numeric column (drop the picture id, col 12).
cor(total_df[, -12])
##              y_coordinate x_coordinate        label       NDAI         SD
## y_coordinate  1.000000000 -0.006376002 -0.213372321 -0.3276781 -0.2646997
## x_coordinate -0.006376002  1.000000000 -0.465822566 -0.5231960 -0.3233889
## label        -0.213372321 -0.465822566  1.000000000  0.6169346  0.2954477
## NDAI         -0.327678096 -0.523195958  0.616934624  1.0000000  0.6310626
## SD           -0.264699672 -0.323388894  0.295447745  0.6310626  1.0000000
## CORR         -0.254309102 -0.361793127  0.444059231  0.4034998  0.2968385
## DF_angle      0.378983970  0.081274648  0.006550085 -0.1610916 -0.2061691
## CF_angle      0.472777594  0.269869879 -0.208279170 -0.3622113 -0.3688601
## BF_angle      0.520812966  0.339539816 -0.337948500 -0.4629301 -0.4404404
## AF_angle      0.530441793  0.368160603 -0.389741017 -0.4927484 -0.4555423
## AN_angle      0.515677887  0.383211616 -0.389358825 -0.4895267 -0.4466229
##                    CORR     DF_angle   CF_angle   BF_angle   AF_angle
## y_coordinate -0.2543091  0.378983970  0.4727776  0.5208130  0.5304418
## x_coordinate -0.3617931  0.081274648  0.2698699  0.3395398  0.3681606
## label         0.4440592  0.006550085 -0.2082792 -0.3379485 -0.3897410
## NDAI          0.4034998 -0.161091626 -0.3622113 -0.4629301 -0.4927484
## SD            0.2968385 -0.206169130 -0.3688601 -0.4404404 -0.4555423
## CORR          1.0000000  0.126283481 -0.1660966 -0.4311043 -0.6039353
## DF_angle      0.1262835  1.000000000  0.8495716  0.6991073  0.5910958
## CF_angle     -0.1660966  0.849571562  1.0000000  0.9119430  0.8216176
## BF_angle     -0.4311043  0.699107255  0.9119430  1.0000000  0.9529627
## AF_angle     -0.6039353  0.591095794  0.8216176  0.9529627  1.0000000
## AN_angle     -0.6820440  0.547609821  0.7727499  0.9043409  0.9706198
##                AN_angle
## y_coordinate  0.5156779
## x_coordinate  0.3832116
## label        -0.3893588
## NDAI         -0.4895267
## SD           -0.4466229
## CORR         -0.6820440
## DF_angle      0.5476098
## CF_angle      0.7727499
## BF_angle      0.9043409
## AF_angle      0.9706198
## AN_angle      1.0000000
# Mixed correlation plot of the features (drop label col 3 and picture col 12).
corrplot.mixed(cor(total_df[, -c(3, 12)]))

# Overlaid density of a feature split by expert label. The identical plot
# code was repeated eight times; this helper builds each panel. `xvals` is
# passed as a pre-computed vector so transformed features (log(SD)) work.
# Note: the static color = "black" outline overrides the color aesthetic,
# exactly as in the original plots; fill distinguishes the labels.
density_by_label <- function(xvals, title) {
  ggplot(total_df) +
    geom_density(aes(x = xvals, group = label, color = labels, fill = labels),
                 color = "black", alpha = 0.7) +
    ggtitle(title) +
    theme_minimal()
}

G <- density_by_label(total_df$NDAI, "Overlaying histogram of NDAI based on labels")
H <- density_by_label(log(total_df$SD), "Overlaying histogram of Log SD based on labels")
I <- density_by_label(total_df$CORR, "Overlaying histogram of CORR based on labels")
A <- density_by_label(total_df$DF_angle, "Overlaying histogram of DF_angle based on labels")
B <- density_by_label(total_df$CF_angle, "Overlaying histogram of CF_angle based on labels")
C <- density_by_label(total_df$BF_angle, "Overlaying histogram of BF_angle based on labels")
D <- density_by_label(total_df$AF_angle, "Overlaying histogram of AF_angle based on labels")
E <- density_by_label(total_df$AN_angle, "Overlaying histogram of AN_angle based on labels")

plot_grid(B, C, D, E, nrow = 2, align = "h")

plot_grid(A, G, H, I, nrow = 2, align = "h")

Problem 2a Data Split

# Drop the unlabelled pixels (label == 0); models use only -1 / 1.
image1 <- image1 %>% filter(label != 0)
image2 <- image2 %>% filter(label != 0)
image3 <- image3 %>% filter(label != 0)
# Stratified 80/20 train/test split on `label` (caret::createDataPartition).
# Returns list(train = ..., test = ...).
traintest_split <- function(data) {
  keep <- createDataPartition(data$label, p = .8,
                              list = FALSE,
                              times = 1)
  list(train = data[keep, ], test = data[-keep, ])
}
# Carve a stratified 20% validation set out of a training set.
# Returns list(val = ..., train = ...).
trainval_split <- function(data) {
  held_out <- createDataPartition(data$label, p = .2,
                                  list = FALSE,
                                  times = 1)
  list(val = data[held_out, ], train = data[-held_out, ])
}

# BUGFIX(review): the original called traintest_split()/trainval_split()
# TWICE per image (once for $train, once for $test), so train and test came
# from two *different* random partitions and could overlap — data leakage.
# Each split is now drawn exactly once and both pieces are taken from the
# same partition.

## image 1
split1 <- traintest_split(image1)
train1_label_split1 <- split1$train
test_label_split1 <- split1$test
# Train-val split: val and train from the SAME partition of the train set.
tv1 <- trainval_split(train1_label_split1)
val_label_split1 <- tv1$val
train_label_split1 <- tv1$train

## image 2
split2 <- traintest_split(image2)
train2_label_split2 <- split2$train
test_label_split2 <- split2$test
tv2 <- trainval_split(train2_label_split2)
val_label_split2 <- tv2$val
train_label_split2 <- tv2$train

## image 3
split3 <- traintest_split(image3)
train3_label_split3 <- split3$train
test_label_split3 <- split3$test
tv3 <- trainval_split(train3_label_split3)
val_label_split3 <- tv3$val
train_label_split3 <- tv3$train

# Pool the per-image pieces into the method-1 train / val / test sets.
method1_train <- rbind(train_label_split1, train_label_split2, train_label_split3)
method1_val <- rbind(val_label_split1, val_label_split2, val_label_split3)
method1_test <- rbind(test_label_split1, test_label_split2, test_label_split3)
method1_train_val <- rbind(method1_train, method1_val)
method1_train_val$label <- factor(method1_train_val$label)
# Check duplicates (prints the distinct rows; the result is not stored)
image1 %>% distinct()
grid_row <- 10
grid_col <- 10

# Block-wise split of one image into ~60% train / 20% val / 20% test.
# The image is cut into a (grid_row-1) x (grid_col-1) grid of spatial blocks
# and the split is done *within* each block, which respects spatial
# autocorrelation better than fully random sampling.
datasplit <- function(grid_row, grid_col, image) {
  ret <- list()
  train_data <- data.frame()
  val_data <- data.frame()
  test_data <- data.frame()
  min_y_cor <- min(image$y_coordinate)
  min_x_cor <- min(image$x_coordinate)
  max_y_cor <- max(image$y_coordinate)
  max_x_cor <- max(image$x_coordinate)
  # Block boundaries; the +1 makes the half-open [lo, hi) intervals cover max.
  divide_row <- seq(min_y_cor, max_y_cor + 1, (max_y_cor + 1 - min_y_cor) / (grid_row - 1))
  divide_col <- seq(min_x_cor, max_x_cor + 1, (max_x_cor + 1 - min_x_cor) / (grid_col - 1))
  # BUGFIX(review): iterate over the INTERVALS, not the boundary points.
  # The original loops ran to length(divide_row) and indexed
  # divide_row[i + 1] (= NA) on the last pass, relying on NA-comparison
  # semantics to produce empty chunks.
  for (i in seq_len(length(divide_row) - 1)) {
    for (j in seq_len(length(divide_col) - 1)) {
      chunk <- image[image$y_coordinate >= divide_row[i] & image$y_coordinate < divide_row[i + 1] &
                       image$x_coordinate >= divide_col[j] & image$x_coordinate < divide_col[j + 1], ]
      # 40% of each block goes to test+val, split evenly between the two.
      test_and_val <- floor(nrow(chunk) * 0.4)
      test_and_val_ind <- sample(seq_len(nrow(chunk)), size = test_and_val)
      val_size <- floor(test_and_val / 2)
      val_ind <- test_and_val_ind[sample(length(test_and_val_ind), size = val_size)]
      # BUGFIX(review): use setdiff() instead of negative indexing. When
      # val_size (or test_and_val) was 0, `x[-integer(0)]` evaluated to an
      # EMPTY selection, so rows of small blocks were silently dropped from
      # the split entirely.
      test_ind <- setdiff(test_and_val_ind, val_ind)
      train_ind <- setdiff(seq_len(nrow(chunk)), test_and_val_ind)
      train_data <- rbind(train_data, chunk[train_ind, ])
      test_data <- rbind(test_data, chunk[test_ind, ])
      val_data <- rbind(val_data, chunk[val_ind, ])
    }
  }
  ret$training <- train_data
  ret$validation <- val_data
  ret$test <- test_data
  return(ret)
}

# Apply the 10x10 grid split to each image, then pool by partition.
split_result_1 <- datasplit(10, 10, image1)
split_result_2 <- datasplit(10, 10, image2)
split_result_3 <- datasplit(10, 10, image3)
all_splits <- list(split_result_1, split_result_2, split_result_3)
train_total <- do.call(rbind, lapply(all_splits, `[[`, "training"))
val_total <- do.call(rbind, lapply(all_splits, `[[`, "validation"))
test_total <- do.call(rbind, lapply(all_splits, `[[`, "test"))
# The picture id is bookkeeping only; drop it before modelling.
train_total$picture <- NULL
val_total$picture <- NULL
test_total$picture <- NULL
train_total$label <- factor(train_total$label)

Problem 2b Baseline

# Trivial baseline: predict every pixel as "clear" (-1). Its accuracy on
# each split equals the share of -1 labels in that split.
test_trivial <- test_total
test_trivial$label <- -1
train_trivial <- train_total
train_trivial$label <- -1
val_trivial <- val_total
val_trivial$label <- -1
mean(test_trivial$label == test_total$label)
## [1] 0.6104143
mean(val_trivial$label == val_total$label)
## [1] 0.6107753
mean(train_trivial$label == train_total$label)
## [1] 0.6109172

Problem 2c First Order Importance

# Combined train+val set used for model selection; labels as factors.
train_val <- rbind(train_total, val_total)
train_val$label <- as.factor(train_val$label)
test_total$label <- as.factor(test_total$label)

# Visual first-order importance: boxplot of one feature split by label.
# The identical plot code was repeated ten times; this helper builds each
# panel with shared styling (no legend, bold-italic title).
box_by_label <- function(yvals, title) {
  ggplot(train_val) +
    geom_boxplot(aes(x = factor(label), y = yvals, color = factor(label))) +
    theme_bw() +
    ggtitle(title) +
    xlab("label") +
    theme(plot.title = element_text(size = rel(1), vjust = 1.5, face = "bold.italic")) +
    theme(legend.position = "none")
}
A <- box_by_label(train_val$y_coordinate, "y_coordinate and label")
B <- box_by_label(train_val$x_coordinate, "x_coordinate and label")
C <- box_by_label(train_val$NDAI, "label and NDAI")
D <- box_by_label(train_val$SD, "label and SD")
E <- box_by_label(train_val$CORR, "label and CORR")
G <- box_by_label(train_val$DF_angle, "label and DF_angle")
H <- box_by_label(train_val$CF_angle, "label and CF_angle")
I <- box_by_label(train_val$BF_angle, "label and BF_angle")
J <- box_by_label(train_val$AF_angle, "label and AF_angle")
# BUGFIX(review): the original read `K <- I <- ggplot(...)`, which
# overwrote the BF_angle plot stored in I with the AN_angle plot — the grid
# below then showed AN_angle twice and BF_angle never.
K <- box_by_label(train_val$AN_angle, "label and AN_angle")

plot_grid(A, B, C, D, E, G, H, I, J, K, nrow = 3, align = "h")

#quantitative
# First-order importance: R^2 from regressing the numeric label on each
# feature individually. One helper replaces nine copy-pasted lm() calls;
# each call prints exactly the same r.squared value as before.
r2_single <- function(feature) {
  summary(lm(as.formula(paste("as.numeric(label) ~", feature)),
             data = train_val))$r.squared
}
r2_single("NDAI")
## [1] 0.5759593
r2_single("CORR")
## [1] 0.3043421
r2_single("AF_angle")
## [1] 0.2584373
r2_single("x_coordinate")
## [1] 0.3259248
r2_single("y_coordinate")
## [1] 0.07891846
r2_single("SD")
## [1] 0.1907891
r2_single("DF_angle")
## [1] 9.926802e-05
r2_single("CF_angle")
## [1] 0.08043943
r2_single("AN_angle")
## [1] 0.255765

Problem 3a

Model 1: Logistic Regression

# ---- Model 1: logistic regression ----
library("dvmisc")
## Warning: package 'dvmisc' was built under R version 3.5.2
## Loading required package: rbenchmark
#logistic regression
# CV settings shared by both splitting methods.
seed = 123
K = 5
# Project-local generic K-fold CV helper (implementation not shown here).
source("CVgeneric.R")
# 5-fold CV of the logistic model on the grid-split data (train_val).
cv_result <- CVgeneric("logistic", c("y_coordinate", "x_coordinate", "NDAI", "SD", "CORR", "DF_angle", "CF_angle", "BF_angle", "AF_angle", 'AN_angle'), "label", K, data=train_val, loss= accuracy, seed)
# NOTE(review): assumes CVgeneric returns the seed it used in $seed — confirm.
set.seed(cv_result$seed)
# Same CV on the per-image random split (method1_train_val).
cv_result_method_2 <- CVgeneric("logistic", c("y_coordinate", "x_coordinate", "NDAI", "SD", "CORR", "DF_angle", "CF_angle", "BF_angle", "AF_angle", 'AN_angle'), "label", K, data=method1_train_val, loss= accuracy, seed)

set.seed(cv_result$seed)

#folds = createFolds(train_val[,labels], k = cv_result$num_of_folds)
# Fit on train_val minus the held-out CV fold (assumes $index holds the
# held-out row indices — TODO confirm against CVgeneric.R).
train_data = train_val[-cv_result$index, ]
logistic_model<- train(label ~ .,  data=train_data, method="glm", family="binomial")

#test accuracy
predicted.classes <- logistic_model %>% predict(test_total[,-3])
mean(predicted.classes == test_total$label)
## [1] 0.8959295
library("MLmetrics")
## 
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## The following object is masked from 'package:base':
## 
##     Recall
#Other error metrics
#F1 Score
F1_Score(as.numeric(predicted.classes), as.numeric(test_total$label))
## [1] 0.9142868
#Sensitivity
Sensitivity(as.numeric(predicted.classes), as.numeric(test_total$label))
## [1] 0.9193266
# Class probabilities on the test set for the cutoff / ROC analysis below.
predictions<- data.frame(predict(logistic_model, newdata =test_total[,-3],
                                 type= "prob" ))
colnames(predictions) <- c(-1,1)
pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg= "vertical", spread.estimate="boxplot",
     show.spread.at= seq(0.1, 0.9, by=0.1), 
     main="Logistic regression cutoff and performance tradeoff")

# Probability cutoff that maximises accuracy on the curve above.
index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]

perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=T, print.cutoffs.at=c(max),
     text.adj=c(1.5,0.2),
     avg="threshold", lwd=3,
     main= "Roc curve for logistic regression")
abline(a=0,b=1)

#### Model 2: LDA

source("CVgeneric.R")
cv_result_lda<-CVgeneric("lda", c("y_coordinate", "x_coordinate", "NDAI", "SD", "CORR", "DF_angle", "CF_angle", "BF_angle", "AF_angle", 'AN_angle'), "label", K = 5, train_val, loss = precision, seed)
cv_result_lda_method_2<-CVgeneric("lda", c("y_coordinate", "x_coordinate", "NDAI", "SD", "CORR", "DF_angle", "CF_angle", "BF_angle", "AF_angle", 'AN_angle'), "label", K = 5, method1_train_val, loss = precision, seed)
train_data = train_val[-cv_result_lda$index, ]
# BUGFIX(review): the LDA model was fit on the full train_val set even
# though train_data (train_val minus the held-out CV fold) was computed on
# the line above; the logistic and tree models both fit on train_data, so
# LDA now does the same for a like-for-like comparison. The cached ## output
# values below were produced by the previous (train_val) fit.
lda.model = lda(factor(label)~., data=train_data)
lda_pred<-predict(lda.model, newdata=test_total[,-3])
# NOTE(review): predict.lda has no `type` argument; "prob" was silently
# ignored, so lda_pre2 is identical to lda_pred. Kept for compatibility.
lda_pre2<-predict(lda.model, newdata=test_total[,-3], type=  "prob")

#Other error metrics
#F1 Score
F1_Score(as.numeric(lda_pred$class), as.numeric(test_total$label))
## [1] 0.9164573
#Sensitivity
Sensitivity(as.numeric(lda_pred$class), as.numeric(test_total$label))
## [1] 0.9285281
#Accuracy
mean(lda_pred$class == test_total$label)
## [1] 0.8993176
# Posterior class probabilities for the cutoff / ROC analysis.
# BUGFIX(review): the original flattened the whole predict() object into a
# data frame and renamed only three of its columns, relying on fragile
# positional names; taking $posterior directly yields exactly the two
# probability columns.
predictions<- data.frame(lda_pred$posterior)
colnames(predictions) <- c(-1,1)

pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg= "vertical", spread.estimate="boxplot", show.spread.at= seq(0.1, 0.9, by=0.1), main="LDA cutoff and performance tradeoff")

# Probability cutoff that maximises accuracy on the curve above.
index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]

perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=T, print.cutoffs.at=c(max), text.adj=c(1,0), avg="threshold", lwd=3, main= "LDA curve for ROC")
abline(a=0,b=1)

Model 3: Decision Tree

# Drop the bookkeeping picture id before modelling.
method1_train_val$picture<-NULL
# Load CART packages
seed = 123
K = 5
# 5-fold CV of the decision tree on the grid-split data (train_val).
cv_result_decision_tree <- CVgeneric("decision tree", c("y_coordinate", "x_coordinate", "NDAI", "SD", "CORR", "DF_angle", "CF_angle", "BF_angle", "AF_angle", 'AN_angle'), "label", K, data=train_val, loss= accuracy, seed)

# Train on train_val minus the held-out CV fold (assumes CVgeneric returns
# the held-out row indices in $index — TODO confirm).
train_data = train_val[-cv_result_decision_tree$index, ]

tree = rpart(label ~ ., data=train_data,maxdepth =10, minsplit= 10)
# Second tree without the spatial coordinates, to see which radiance
# features the tree uses when location is unavailable.
tree_no_xy = rpart(label ~ NDAI+SD+CORR+DF_angle+CF_angle+BF_angle+AF_angle+AN_angle , data=train_data,maxdepth =10, minsplit= 10)
fancyRpartPlot(tree_no_xy)

# Class probabilities and hard class predictions on the test set.
# BUGFIX(review): predict.rpart has no `method` argument — the original
# `method = 'response'` was silently swallowed and the default (class
# probabilities) returned anyway; the invalid argument is dropped.
tree.pred = predict(tree, newdata=test_total[,-3])
tree.pred2 = predict(tree, newdata=test_total[,-3],type = 'class')

# Test accuracy
mean(tree.pred2==test_total$label)
## [1] 0.922674
fancyRpartPlot(tree)

#Other error metrics
#F1 Score
F1_Score(as.numeric(tree.pred2), as.numeric(test_total$label))
## [1] 0.9341357
#Sensitivity
Sensitivity(as.numeric(tree.pred2), as.numeric(test_total$label))
## [1] 0.9729269
predictions<- data.frame(predict(tree, newdata =test_total[,-3], type= "prob" ))
colnames(predictions) <- c(-1,1)

pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg= "vertical", spread.estimate="boxplot", 
     show.spread.at= seq(0.1, 0.9, by=0.1), 
     main="Decision Tree cutoff and performance tradeoff")

# Probability cutoff that maximises accuracy, marked on the ROC curve below.
index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=T, print.cutoffs.at=max, text.adj=c(1,0), avg="threshold", 
     lwd=3, 
     main= "Roc curve for Decision Tree")
abline(a=0,b=1)

# Variable importance table. Note: the column named `importance` actually
# holds the feature names (row names); `Overall` holds the scores.
tree_importance<-as.data.frame(varImp(tree))
tree_importance$importance<-rownames(tree_importance)
tree_importance <- tree_importance %>% arrange(desc(Overall))

#CV method 2
cv_result_decision_tree_method2 <- CVgeneric("decision tree", c("y_coordinate", "x_coordinate", "NDAI", "SD", "CORR", "DF_angle", "CF_angle", "BF_angle", "AF_angle", 'AN_angle'), "label", K, data=method1_train_val, loss= accuracy, seed)
train_data_tree_2 = method1_train_val[-cv_result_decision_tree_method2$index, ]
tree2 = rpart(label ~ ., data=train_data_tree_2,maxdepth =10, minsplit= 10)
tree.pred_method_2 <- predict(tree2, newdata=test_total[,-3],type = 'class')

Model 4:Random Forest

seed = 123
K = 5
#CV method 1
cv_result_rf <- CVgeneric("random forest", c("y_coordinate", "x_coordinate", "NDAI", "SD", "CORR", "DF_angle", "CF_angle", "BF_angle", "AF_angle", 'AN_angle'), "label", K, data=train_val, loss= accuracy, seed)
train_data_rf = train_val[-cv_result_rf$index, ]

# Tune mtry (number of variables sampled as candidates at each split) by
# OOB error with 50 trees.
# BUGFIX(review): tuneRF was run on `train_data` — the decision-tree
# section's training set — rather than the random-forest training set
# computed on the line above; it now tunes on train_data_rf (label is col 3).
Random_forest_hyperparameter_tune<-tuneRF(x =train_data_rf[,-3],
              y = as.factor(train_data_rf[,3]),
              ntreeTry = 50)
## mtry = 3  OOB error = 0.46% 
## Searching left ...
## mtry = 2     OOB error = 0.58% 
## -0.2536585 0.05 
## Searching right ...
## mtry = 6     OOB error = 0.36% 
## 0.2097561 0.05 
## mtry = 10    OOB error = 0.41% 
## -0.117284 0.05

# OOB error as a function of mtry (minimum at mtry = 6 above).
plot(data.frame(Random_forest_hyperparameter_tune)$mtry,
     data.frame(Random_forest_hyperparameter_tune)$OOBError, 
     type = "l",ylab="OOBError",xlab="mtry", 
     main = "Random Forest mtry tune")

#Hyperparameter in R for random Forest
#ntree: Number of trees to grow.

# Final random forest on the grid-split training data.
# BUGFIX(review): the tuned value (mtry = 6 from tuneRF) was being passed as
# `ntry`, which is not a randomForest() argument and is silently swallowed
# by `...` — the default mtry was used instead. Renamed to `mtry`.
rf_model <- randomForest(as.factor(label) ~ y_coordinate + x_coordinate +
                           NDAI + SD + CORR + DF_angle + CF_angle + BF_angle +
                           AF_angle + AN_angle, data = train_data_rf,
                         importance = TRUE,
                         ntree=50,
                         mtry=6,
                         maxnodes= 500, 
                         nodesize = 10)
predicted.classes_rf <- rf_model %>% predict(test_total[,-3])
# Test accuracy
mean(predicted.classes_rf == test_total$label)
## [1] 0.9856546
#Other error metrics
#F1 Score
F1_Score(as.numeric(predicted.classes_rf), as.numeric(test_total$label))
## [1] 0.988177
#Sensitivity
Sensitivity(as.numeric(predicted.classes_rf), as.numeric(test_total$label))
## [1] 0.994301
# Class probabilities for the cutoff / ROC analysis below.
predictions<- data.frame(predict(rf_model, newdata =test_total[,-3], type= "prob" ))
colnames(predictions) <- c(-1,1)

pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
# BUGFIX(review): fixed "Random Foret" typo in the plot title.
plot(perf, avg= "vertical", spread.estimate="boxplot", 
     show.spread.at= seq(0.1, 0.9, by=0.1), 
     main="Random Forest cutoff and performance tradeoff")

# Probability cutoff that maximises accuracy, marked on the ROC curve below.
index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]

perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=T, print.cutoffs.at=c(max), text.adj=c(0.5,0), 
     avg="threshold", lwd=3, main= "Roc curve for Random Forest")
abline(a=0,b=1)

# Permutation importance per class.
varImp(rf_model)
##                     -1         1
## y_coordinate 23.939252 23.939252
## x_coordinate 27.422301 27.422301
## NDAI         26.462382 26.462382
## SD            8.316239  8.316239
## CORR         16.227750 16.227750
## DF_angle     11.040074 11.040074
## CF_angle     10.558653 10.558653
## BF_angle      8.243116  8.243116
## AF_angle      7.569437  7.569437
## AN_angle     10.725195 10.725195
#CV method 2
cv_result_rf_method2 <- CVgeneric("random forest", 
                                  c("y_coordinate", "x_coordinate", "NDAI", "SD", 
                                    "CORR", "DF_angle", "CF_angle", "BF_angle", 
                                    "AF_angle", 'AN_angle'), 
                                  "label", K, data=method1_train_val, 
                                  loss= accuracy, 
                                  seed)
#Print accuracy result
#cv_result_rf_method2
#train data by using the best training sets
train_data_rf2 = method1_train_val[-cv_result_rf_method2$index, ]

# BUGFIX(review): `ntry` is not a randomForest() argument (it is silently
# swallowed by `...`), so the tuned value was never applied; the number of
# variables tried per split must be passed as `mtry`.
rf_model_method_2 <- randomForest(as.factor(label) ~ y_coordinate + 
                                    x_coordinate + NDAI + SD + CORR +
                                    DF_angle + CF_angle + BF_angle +AF_angle + 
                                    AN_angle,
                                  data = train_data_rf2, 
                                  importance = TRUE,ntree=50,
                                  mtry=6,maxnodes= 500, nodesize = 10)

predicted.classes_rf_method_2 <- rf_model_method_2 %>% predict(test_total[,-3])

Problem 4 Diagnostics

4a

#Even though decision tree does not give the best train/test accuracy, it's a good method of classification due to its clarity in procedure. 

#Even though the cross-validation accuracy showed that there is no overfitting in the training set, we demonstrate tree pruning here, a method used in decision trees to balance cost complexity against model fit. 
#We can prune the tree using a cost complexity prune of 0.02
# Bar chart of the decision-tree variable importance computed earlier.
# (tree_importance$importance holds the feature names, $Overall the scores.)
ggplot(data=tree_importance, aes(x=reorder(importance, -Overall), y=Overall)) +
  geom_bar(stat="identity", color="black", fill="lightgreen")+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  xlab("features")+
  ylab("importance")+
  ggtitle("Feature importance in Decision Tree")

# Regression tree grown for cost-complexity pruning diagnostics.
# NOTE(review): train_val$label is a factor at this point, so method="anova"
# appears to model its integer codes (node means around 1.39 in the output
# below) rather than the -1/1 values — confirm this is intended before
# interpreting the node means; the CP table is still usable for pruning.
fit <- rpart(label~.,
   method="anova", data=train_val)
printcp(fit) # display the cost-complexity (CP) pruning table
## 
## Regression tree:
## rpart(formula = label ~ ., data = train_val, method = "anova")
## 
## Variables actually used in tree construction:
## [1] AN_angle     CORR         NDAI         x_coordinate
## 
## Root node error: 39564/166443 = 0.23771
## 
## n= 166443 
## 
##         CP nsplit rel error  xerror      xstd
## 1 0.643034      0   1.00000 1.00001 0.0011149
## 2 0.034628      1   0.35697 0.35707 0.0022032
## 3 0.021174      2   0.32234 0.32249 0.0021659
## 4 0.020619      3   0.30116 0.30698 0.0020901
## 5 0.015749      4   0.28055 0.28105 0.0018821
## 6 0.010000      5   0.26480 0.26543 0.0019129
# Cross-validated error against CP, to pick a pruning threshold visually.
plotcp(fit)

# Full tree report: CP table, variable importance, and per-node split detail.
summary(fit) 
## Call:
## rpart(formula = label ~ ., data = train_val, method = "anova")
##   n= 166443 
## 
##           CP nsplit rel error    xerror        xstd
## 1 0.64303417      0 1.0000000 1.0000117 0.001114925
## 2 0.03462778      1 0.3569658 0.3570734 0.002203184
## 3 0.02117380      2 0.3223381 0.3224903 0.002165878
## 4 0.02061901      3 0.3011642 0.3069791 0.002090116
## 5 0.01574880      4 0.2805452 0.2810524 0.001882125
## 6 0.01000000      5 0.2647964 0.2654345 0.001912880
## 
## Variable importance
##         NDAI           SD         CORR     AN_angle     AF_angle 
##           25           18           15           15           13 
##     BF_angle x_coordinate     CF_angle     DF_angle 
##           12            1            1            1 
## 
## Node number 1: 166443 observations,    complexity param=0.6430342
##   mean=1.389118, MSE=0.2377052 
##   left son=2 (90754 obs) right son=3 (75689 obs)
##   Primary splits:
##       NDAI         < 0.7947012  to the left,  improve=0.6430342, (0 missing)
##       SD           < 3.080932   to the left,  improve=0.4653290, (0 missing)
##       CORR         < 0.1966437  to the left,  improve=0.4485266, (0 missing)
##       x_coordinate < 229.5      to the right, improve=0.2647618, (0 missing)
##       AN_angle     < 177.6024   to the right, improve=0.2641968, (0 missing)
##   Surrogate splits:
##       SD       < 3.134231   to the left,  agree=0.868, adj=0.709, (0 split)
##       CORR     < 0.1949229  to the left,  agree=0.807, adj=0.576, (0 split)
##       AN_angle < 181.8939   to the right, agree=0.783, adj=0.522, (0 split)
##       AF_angle < 186.9681   to the right, agree=0.763, adj=0.478, (0 split)
##       BF_angle < 245.0755   to the right, agree=0.747, adj=0.443, (0 split)
## 
## Node number 2: 90754 observations,    complexity param=0.0211738
##   mean=1.032076, MSE=0.03104687 
##   left son=4 (89546 obs) right son=5 (1208 obs)
##   Primary splits:
##       AN_angle < 172.2778   to the right, improve=0.2973169, (0 missing)
##       AF_angle < 177.8793   to the right, improve=0.1943166, (0 missing)
##       NDAI     < 0.3929783  to the left,  improve=0.1777455, (0 missing)
##       CORR     < 0.2075311  to the left,  improve=0.1707469, (0 missing)
##       BF_angle < 190.1277   to the right, improve=0.1378417, (0 missing)
##   Surrogate splits:
##       AF_angle     < 178.8896   to the right, agree=0.996, adj=0.667, (0 split)
##       BF_angle     < 191.1332   to the right, agree=0.992, adj=0.410, (0 split)
##       CF_angle     < 197.68     to the right, agree=0.989, adj=0.205, (0 split)
##       DF_angle     < 209.0842   to the right, agree=0.988, adj=0.104, (0 split)
##       y_coordinate < 21.5       to the right, agree=0.988, adj=0.065, (0 split)
## 
## Node number 3: 75689 observations,    complexity param=0.03462778
##   mean=1.817226, MSE=0.1493678 
##   left son=6 (6902 obs) right son=7 (68787 obs)
##   Primary splits:
##       x_coordinate < 295.5      to the right, improve=0.12118230, (0 missing)
##       CORR         < 0.1938472  to the left,  improve=0.11668150, (0 missing)
##       y_coordinate < 45.5       to the left,  improve=0.07704374, (0 missing)
##       DF_angle     < 261.4637   to the left,  improve=0.06366120, (0 missing)
##       SD           < 2.742566   to the left,  improve=0.05255161, (0 missing)
##   Surrogate splits:
##       AN_angle < 263.4902   to the right, agree=0.909, adj=0.003, (0 split)
##       AF_angle < 277.8265   to the right, agree=0.909, adj=0.003, (0 split)
##       CORR     < -0.3280679 to the left,  agree=0.909, adj=0.000, (0 split)
##       BF_angle < 300.493    to the right, agree=0.909, adj=0.000, (0 split)
## 
## Node number 4: 89546 observations
##   mean=1.020917, MSE=0.02047912 
## 
## Node number 5: 1208 observations
##   mean=1.859272, MSE=0.120924 
## 
## Node number 6: 6902 observations,    complexity param=0.0157488
##   mean=1.392495, MSE=0.2384427 
##   left son=12 (4542 obs) right son=13 (2360 obs)
##   Primary splits:
##       AN_angle     < 195.8497   to the left,  improve=0.3786106, (0 missing)
##       AF_angle     < 211.1927   to the left,  improve=0.3404610, (0 missing)
##       BF_angle     < 230.3768   to the left,  improve=0.3158916, (0 missing)
##       y_coordinate < 66.5       to the left,  improve=0.2906732, (0 missing)
##       CF_angle     < 249.0817   to the left,  improve=0.2870195, (0 missing)
##   Surrogate splits:
##       AF_angle     < 209.1577   to the left,  agree=0.946, adj=0.841, (0 split)
##       BF_angle     < 230.3455   to the left,  agree=0.908, adj=0.731, (0 split)
##       CF_angle     < 249.1973   to the left,  agree=0.871, adj=0.622, (0 split)
##       y_coordinate < 77.5       to the left,  agree=0.843, adj=0.540, (0 split)
##       DF_angle     < 264.0044   to the left,  agree=0.841, adj=0.536, (0 split)
## 
## Node number 7: 68787 observations,    complexity param=0.02061901
##   mean=1.859843, MSE=0.1205132 
##   left son=14 (24633 obs) right son=15 (44154 obs)
##   Primary splits:
##       CORR     < 0.1990036  to the left,  improve=0.09840812, (0 missing)
##       SD       < 2.615267   to the left,  improve=0.04340106, (0 missing)
##       DF_angle < 292.9841   to the left,  improve=0.04142987, (0 missing)
##       AF_angle < 227.3604   to the right, improve=0.04092657, (0 missing)
##       NDAI     < 3.27511    to the right, improve=0.03594479, (0 missing)
##   Surrogate splits:
##       AF_angle     < 224.184    to the right, agree=0.716, adj=0.207, (0 split)
##       AN_angle     < 206.0282   to the right, agree=0.716, adj=0.206, (0 split)
##       SD           < 3.828523   to the left,  agree=0.703, adj=0.170, (0 split)
##       DF_angle     < 280.5912   to the left,  agree=0.692, adj=0.141, (0 split)
##       y_coordinate < 276.5      to the right, agree=0.685, adj=0.120, (0 split)
## 
## Node number 12: 4542 observations
##   mean=1.175914, MSE=0.1449681 
## 
## Node number 13: 2360 observations
##   mean=1.809322, MSE=0.1543199 
## 
## Node number 14: 24633 observations
##   mean=1.714042, MSE=0.204186 
## 
## Node number 15: 44154 observations
##   mean=1.941183, MSE=0.05535744
# Plot approximate R-squared and relative error versus the number of splits.
rsq.rpart(fit)
## 
## Regression tree:
## rpart(formula = label ~ ., data = train_val, method = "anova")
## 
## Variables actually used in tree construction:
## [1] AN_angle     CORR         NDAI         x_coordinate
## 
## Root node error: 39564/166443 = 0.23771
## 
## n= 166443 
## 
##         CP nsplit rel error  xerror      xstd
## 1 0.643034      0   1.00000 1.00001 0.0011149
## 2 0.034628      1   0.35697 0.35707 0.0022032
## 3 0.021174      2   0.32234 0.32249 0.0021659
## 4 0.020619      3   0.30116 0.30698 0.0020901
## 5 0.015749      4   0.28055 0.28105 0.0018821
## 6 0.010000      5   0.26480 0.26543 0.0019129

Diagnostics 4b

# See section 4d below for the misclassification diagnostics shared by both splits.

Diagnostics 4c AdaBoost

# Fit an AdaBoost classifier on all ten features plus the pixel coordinates.
# NOTE(review): the model object shadows the adaboost() function name; function
# lookup still resolves calls correctly, but a distinct name would be clearer —
# kept as-is in case the object is referenced later in the document.
adaboost <- adaboost(label ~ y_coordinate + x_coordinate + NDAI + SD + CORR +
                       DF_angle + CF_angle + BF_angle + AF_angle + AN_angle,
                     data = train_val, nIter = 10)

# Predict once on the test features (column 3 is the label) and reuse the
# result — the original called predict() twice on the same newdata, doubling
# the cost of scoring the boosted ensemble.
ada_pred <- predict(adaboost, newdata = test_total[, -3])

# Class probabilities, columns named after the two labels (-1 and 1).
predictions <- data.frame(ada_pred$prob)
colnames(predictions) <- c(-1, 1)

# Test-set accuracy of the hard class predictions.
mean(ada_pred$class == test_total$label)
## [1] 0.9977893

# ROCR objects built from the probability of class 1; plot accuracy as a
# function of the probability cutoff.
pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg = "vertical", spread.estimate = "boxplot",
     show.spread.at = seq(0.1, 0.9, by = 0.1),
     main = "Adaboosting cutoff and performance tradeoff")

# Label 1 accounts for roughly 40% of the observations; next, find the
# probability cutoff that maximizes accuracy on the test set.
# Pick the cutoff at which the accuracy curve peaks: `index` is the position
# of the maximum accuracy, `max` is the corresponding probability threshold.
index <- which.max(perf@y.values[[1]])
max <- perf@x.values[[1]][index]

# Redraw as an ROC curve (TPR vs FPR), colorized by cutoff, with the
# accuracy-maximizing cutoff annotated; the diagonal marks a random classifier.
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize = TRUE, print.cutoffs.at = c(max), text.adj = c(0.3, 0),
     avg = "threshold", lwd = 3, main = "Adaboosting curve for ROC")
abline(a = 0, b = 1)

Diagnostics 4d

# Misclassification pattern for the random-forest model (split method 1):
# keep only the rows where the prediction disagrees with the true label,
# then map where those errors fall in the image.
mis_label_random_forest <- train_data_rf[predicted.classes_rf != test_total$label, ]
ggplot(mis_label_random_forest,
       aes(x = x_coordinate, y = y_coordinate, color = factor(label))) +
  geom_point(alpha = 0.5) +
  labs(x = " x coordinate", y = " y coordinate",
       title = "Mis-classification in random-forest model split method 1 ") +
  theme_bw()

# Confusion counts shown as a fourfold plot.
Predicted <- predicted.classes_rf
Actual <- test_total$label
fourfoldplot(table(Predicted, Actual))

# Split method 2: same misclassification map for the second random-forest fit.
mis_label_random_forest <- train_data_rf2[predicted.classes_rf_method_2 != test_total$label, ]
ggplot(mis_label_random_forest,
       aes(x = x_coordinate, y = y_coordinate, color = factor(label))) +
  geom_point(alpha = 0.5) +
  labs(x = " x coordinate", y = " y coordinate",
       title = "Mis-classification in random-forest model split method 2 ") +
  theme_bw()

# Fourfold plot of the method-2 confusion counts (Actual defined above).
Predicted <- predicted.classes_rf_method_2
fourfoldplot(table(Predicted, Actual))

# Misclassification pattern for the decision tree (split method 1): subset the
# mispredicted rows and plot their locations, colored by the true label.
mis_label_decision_tree <- train_data[tree.pred2 != test_total$label, ]
ggplot(mis_label_decision_tree,
       aes(x = x_coordinate, y = y_coordinate, color = factor(label))) +
  geom_point(alpha = 0.5) +
  labs(x = " x coordinate", y = " y coordinate",
       title = "Mis-classification in decision tree model split method 1 ") +
  theme_bw()

# Fourfold plot of the confusion counts (Actual defined above).
Predicted <- tree.pred2
fourfoldplot(table(Predicted, Actual))

# Split method 2: misclassification map for the second decision-tree fit.
mis_label_decision_tree2 <- train_data_tree_2[tree.pred_method_2 != test_total$label, ]
ggplot(mis_label_decision_tree2,
       aes(x = x_coordinate, y = y_coordinate, color = factor(label))) +
  geom_point(alpha = 0.5) +
  labs(x = " x coordinate", y = " y coordinate",
       title = "Mis-classification in decision tree model split method 2 ") +
  theme_bw()

# Fourfold plot of the method-2 confusion counts (Actual defined above).
Predicted <- tree.pred_method_2
fourfoldplot(table(Predicted, Actual))